The E. coli scheme from Enterobase [1] was used with chewBBACA [2] to get the cgMLST profile for each isolate. The number of missing loci in total (sum of LNF, NIPH, PLOT, ALM and ASM from chewBBACA) is plotted below for all isolates.
# Vector of excluded samples
ex_samples <- c(
"2016-17-565-1-S",
"2016-02-428-2-S",
"2016-02-486-2-S",
"2016-02-732-2-S",
"2014-01-1741-1-S"
)
# Collapses data frame to unique lines while ignoring NA's
func_paste <- function(x) paste(unique(x[!is.na(x)]), collapse = ", ")
"%not_in%" <- Negate("%in%")
# Isolate data
id_data <- read.table("../data/id.txt",
sep = "\t",
header = TRUE,
stringsAsFactors = F)
isolate_data <- read.table("../data/isolate_data.txt",
sep = "\t",
header = T,
stringsAsFactors = F) %>%
filter(id %not_in% ex_samples)
# cgMLST data
cgmlst_stats <- read.table("../data/chewbbaca/complete_results/mdata_stats.tsv",
sep = "\t",
header = TRUE,
stringsAsFactors = FALSE)
detailed_cgmlst_stats <- read.table("../data/chewbbaca/results_statistics.tsv",
sep = "\t",
header = TRUE,
stringsAsFactors = FALSE)
cgMLST_allele_data <- read.table("../data/chewbbaca/complete_results/cgMLST.tsv",
sep = "\t",
header = TRUE,
stringsAsFactors = FALSE) %>%
mutate(FILE = sub("_pilon_spades.fasta", "", FILE)) %>%
dplyr::rename("ref" = FILE) %>%
left_join(id_data, by = "ref") %>%
select(id, everything(), -ref) %>%
filter(id %not_in% ex_samples)boxplot(cgmlst_stats$number.of.missing.data,
ylab = "Number of missing loci")Below follows the detailed results for all loci in the cgMLST scheme. EXC = excact matches, INF = novel allele, LNF = locus not found, NIPH = paralogous locus, PLOT = possible loci on tip of contig, ALM = allele 20 % larger than reference, ASM = allele 20 % smaller than reference. The numbers below each boxplot denote the mean.
test <- detailed_cgmlst_stats %>%
gather(metric, value, -Genome) %>%
group_by(metric) %>%
mutate(mean_val = round(mean(value), 1))
ggplot(test, aes(metric, value)) +
stat_boxplot(geom = "errorbar", width = 0.5) +
geom_boxplot() +
geom_text(data = subset(test,
Genome == "2006-01-2184_S30_pilon_spades.fasta"),
aes(label = mean_val),
y = -60,
size = 3.8) +
scale_x_discrete(limits = c("EXC",
"INF",
"LNF",
"NIPH",
"PLOT",
"ALM",
"ASM")) +
labs(y = "Number of loci") +
theme(axis.title.x = element_blank(),
axis.ticks.x = element_blank())Genetic distances were calculated from the allele data from the cgMLST report, and a dendrogram was drawn from these distances. The annotations are based on data from the ARIBA resistance gene analysis.
cgMLST_data <- read.table("../data/chewbbaca/complete_results/clean_cgMLST_data.txt",
sep = "\t",
header = TRUE,
colClasses = "factor")
total_tree <- as.phylo(hclust(daisy(cgMLST_data, metric = "gower"), method = "average"))
tree_metadata <- read.table("../data/chewbbaca/complete_results/tree_metadata.txt",
sep = "\t",
header = TRUE)
tree_heatmap_data <- read.table("../data/chewbbaca/complete_results/tree_heatmap_data.txt",
sep = "\t",
header = TRUE) %>%
select(-gadX)
palette <- c("Broiler" = "#4575b4",
"Pig" = "#74add1",
"Red fox" = "#f46d43",
"Wild bird" = "#fdae61")
palette2 <- c("1-A" = "#fc8d59",
"1-I" = "#80b1d3",
"0" = "grey95")
total_tree_annotated <- ggtree(total_tree,
layout = "circular",
color = "#808080",
size = 0.5) %<+% tree_metadata +
geom_tippoint(aes(color = species),
size = 3) +
geom_tiplab2(aes(label = ST),
offset = 0.01,
size = 4) +
scale_color_manual(values = palette)
total_tree_opened <- rotate_tree(open_tree(total_tree_annotated, 8), 93)
gheatmap(total_tree_opened,
tree_heatmap_data,
offset = 0.04,
width = 0.3,
colnames_offset_y = 2.5,
colnames_position = "top",
font.size = 5) +
scale_fill_manual(values = palette2,
labels = c("Gene/Mutation absent",
"Acquired gene present",
"Intrinsic gene mutated")) +
theme(legend.position = c(0.9,0.9),
legend.text = element_text(size = 20))metadata_species <- read.table("../data/chewbbaca/complete_results/metadata_species_specific_trees.txt",
sep = "\t",
header = TRUE)
palette3 <- c("0" = "grey95",
# gyrA
"S83L" = "#c6dbef",
"S83A" = "#9ecae1",
"D87N" = "#6baed6",
"D87H" = "#4292c6",
"D87Y" = "#2171b5",
"D87G" = "#08519c",
"S83L, D87N" = "#08306b",
# parC
"S80I" = "#a1d99b",
"S80R" = "#74c476",
"S57T" = "#41ab5d",
"A56T, S80I" = "#238b45",
"S80I, E84V" = "#005a32",
# parE
"D475E" = "#fdd0a2",
"D463N" = "#fdae6b",
"S458A" = "#fd8d3c",
"L416F" = "#f16913",
"H516Y" = "#d94801",
"L488M, A512T" = "#8c2d04",
# PMQR
"qnrA1" = "#762a83",
"qnrB19" = "#ef8a62",
"qnrS1" = "#a6dba0",
"qnrS2" = "#5aae61",
"qnrS4" = "#1b7837",
"qepA4" = "#80cdc1",
# num
"1" = "#440154FF",
"2" = "#3B528BFF",
"3" = "#21908CFF",
"4" = "#5DC863FF",
"5" = "#FDE725FF")
# Broiler tree
broiler_cgMLST <- read.table("../data/chewbbaca/complete_results/broiler_cgmlst.txt",
sep = "\t",
header = TRUE,
colClasses = "factor")
broiler_tree <- as.phylo(hclust(daisy(broiler_cgMLST, metric = "gower"), method = "average"))
broiler_tree_annotated <- ggtree(broiler_tree,
layout = "circular",
color = "#808080",
size = 0.5) %<+% tree_metadata +
geom_tippoint(color = "#4575b4",
size = 4) +
geom_tiplab2(aes(label = ST),
offset = 0.01,
size = 6)
broiler_tree_opened <- rotate_tree(open_tree(broiler_tree_annotated, 10), 92.5)
broiler_complete <- gheatmap(broiler_tree_opened,
metadata_species,
offset = 0.1,
width = 0.5,
colnames_offset_y = 0.75,
colnames_position = "top",
font.size = 5) +
scale_fill_manual(values = palette3) +
guides(fill = FALSE)
# Pig tree
pig_cgMLST <- read.table("../data/chewbbaca/complete_results/pig_cgmlst.txt",
sep = "\t",
header = TRUE,
colClasses = "factor")
pig_tree <- as.phylo(hclust(daisy(pig_cgMLST, metric = "gower"), method = "average"))
pig_tree_annotated <- ggtree(pig_tree,
layout = "circular",
color = "#808080",
size = 0.5) %<+% tree_metadata +
geom_tippoint(color = "#74add1",
size = 4) +
geom_tiplab2(aes(label = ST),
offset = 0.01,
size = 5.5)
pig_tree_opened <- rotate_tree(open_tree(pig_tree_annotated, 10), 92.5)
pig_complete <- gheatmap(pig_tree_opened,
metadata_species,
offset = 0.09,
width = 0.5,
colnames_offset_y = 0.53,
colnames_position = "top",
font.size = 5) +
scale_fill_manual(values = palette3) +
guides(fill = FALSE)
# Red fox tree
fox_cgMLST <- read.table("../data/chewbbaca/complete_results/fox_cgmlst.txt",
sep = "\t",
header = TRUE,
colClasses = "factor")
fox_tree <- as.phylo(hclust(daisy(fox_cgMLST, metric = "gower"), method = "average"))
fox_tree_annotated <- ggtree(fox_tree,
layout = "circular",
color = "#808080",
size = 0.5) %<+% tree_metadata +
geom_tippoint(color = "#f46d43",
size = 4) +
geom_tiplab2(aes(label = ST),
offset = 0.01,
size = 5.5)
fox_tree_opened <- rotate_tree(open_tree(fox_tree_annotated, 10), 92)
fox_complete <- gheatmap(fox_tree_opened,
metadata_species,
offset = 0.09,
width = 0.5,
colnames_offset_y = 0.17,
colnames_position = "top",
font.size = 5) +
scale_fill_manual(values = palette3) +
guides(fill = FALSE)
# Wild bird tree
bird_cgMLST <- read.table("../data/chewbbaca/complete_results/bird_cgmlst.txt",
sep = "\t",
header = TRUE,
colClasses = "factor")
bird_tree <- as.phylo(hclust(daisy(bird_cgMLST, metric = "gower"), method = "average"))
bird_tree_annotated <- ggtree(bird_tree,
layout = "circular",
color = "#808080",
size = 0.5) %<+% tree_metadata +
geom_tippoint(color = "#fdae61",
size = 4) +
geom_tiplab2(aes(label = ST),
offset = 0.01,
size = 5.5)
bird_tree_opened <- rotate_tree(open_tree(bird_tree_annotated, 10), 92.5)
bird_complete <- gheatmap(bird_tree_opened,
metadata_species,
offset = 0.09,
width = 0.5,
colnames_offset_y = 0.38,
colnames_position = "top",
font.size = 5) +
scale_fill_manual(values = palette3) +
guides(fill = FALSE)
# Total plot
tot_plot <- plot_grid(broiler_complete,
pig_complete,
fox_complete,
bird_complete,
nrow = 2,
ncol = 2,
align = "h")
include_graphics("../figures/species_dendrogram.png")Below follows a plot of the distribution of the lower triangular distances for each animal species. A bimodal distribution is present in all four plots. The filtering values of < 0.1 and > 0.8 is used for downstream statistics to acertain whether the broiler isolates are more homogenous than the isolates from other species.
# calculates distances from cgMLST data and returns
# the lower triangular of the matrix, with (diag = FALSE)
# or without (diag = TRUE) the diagonal
lower_tri_dist_calc <- function(data, diag = TRUE, data_frame = FALSE) {
dist <- as.matrix(daisy(data, metric = "gower"))
if (diag == TRUE) {
dist[upper.tri(dist, diag = TRUE)] <- NA
} else {
dist[upper.tri(dist, diag = FALSE)] <- NA
}
dist_vec <- as.vector(as.matrix(dist))
dist_compl <- dist_vec[!is.na(dist_vec)]
if (data_frame == TRUE) {
dist_compl <- as.data.frame(as.matrix(dist))
}
return(dist_compl)
}
broiler_dist <- lower_tri_dist_calc(broiler_cgMLST)
pig_dist <- lower_tri_dist_calc(pig_cgMLST)
fox_dist <- lower_tri_dist_calc(fox_cgMLST)
bird_dist <- lower_tri_dist_calc(bird_cgMLST)
# Distance plots
par(mfrow = c(2,2))
x1 <- hist(broiler_dist,
breaks = 20,
plot = FALSE)
x1$density = x1$counts/sum(x1$counts)*100
plot(x1,
freq = FALSE,
main = "Broiler distance frequency",
xlab = "Distance",
ylim = c(0, 50))
x2 <- hist(pig_dist,
breaks = 20,
plot = FALSE)
x2$density = x2$counts/sum(x2$counts)*100
plot(x2,
freq = FALSE,
main = "Pig distance frequency",
xlab = "Distance",
ylim = c(0, 50))
x3 <- hist(fox_dist,
breaks = 20,
plot = FALSE)
x3$density = x3$counts/sum(x3$counts)*100
plot(x3,
freq = FALSE,
main = "Red fox distance frequency",
xlab = "Distance",
ylim = c(0, 50))
x4 <- hist(bird_dist,
breaks = 20,
plot = FALSE)
x4$density = x4$counts/sum(x4$counts)*100
plot(x4,
freq = FALSE,
main = "Wild bird distance frequency",
xlab = "Distance",
ylim = c(0, 50))Distribution of percentages of distances < 0.1 or > 0.8. The distribution is based on 1000 iterations of the distance data. Observed values are denoted as arrows for each animal species; dark blue = Broiler, light blue = Pig, red = Red fox, yellow = Wild bird. The mean value is denoted as a vertical black line.
# generate data frame from allele data
alleles <- cgMLST_allele_data %>%
left_join(isolate_data[,c("id","species")], by = "id") %>%
group_by(species) %>%
mutate(n = 1:n(),
new_id = case_when(species == "Broiler" ~ paste("BR", n, sep = "_"),
species == "Pig" ~ paste("PI", n, sep = "_"),
species == "Red fox" ~ paste("RF", n, sep = "_"),
species == "Wild bird" ~ paste("WB", n, sep = "_"))) %>%
ungroup() %>%
select(new_id, everything(), -c(species, id)) %>%
mutate_all(funs(as.factor)) %>%
column_to_rownames("new_id")
# generate data frame for iteration analysis
run_df <- lower_tri_dist_calc(alleles, data_frame = TRUE) %>%
rownames_to_column("id") %>%
gather(isolate1, value, -id) %>%
dplyr::rename("isolate2" = id) %>%
mutate(group1 = substr(isolate1, start = 1, stop = 2),
group2 = substr(isolate2, start = 1, stop = 2)) %>%
select(isolate1, group1, isolate2, group2, value) %>%
filter(is.na(value) == FALSE,
group1 == group2) %>%
arrange(group1, group2)
# functions used in iteration analyses
## Calculates the sum of observations lower than dist_value1 and
## bigger than dist_value2, and calculates the percentage of
## observed values for each group
without <- function(df, run, dist_value1 = 0.1, dist_value2 = 0.8) {
df <- df %>%
group_by(group1, group2) %>%
mutate(n = value < dist_value1 | value > dist_value2) %>%
group_by(group1, group2, n) %>%
count() %>%
ungroup() %>%
mutate(n = ifelse(n == TRUE, "Within", "Without")) %>%
spread(n, nn) %>%
mutate(Percent = round(Within / (Within + Without) * 100, 2),
iter = run)
return(df)
}
## Calculates the sum of observations between dist_value1 and dist_value2
## and calculates the percentage of observed values for each group
within <- function(df, run, dist_value1 = 0.1, dist_value2 = 0.8) {
df <- df %>%
group_by(group1, group2) %>%
mutate(n = value > dist_value1 & value < dist_value2) %>%
group_by(group1, group2, n) %>%
count() %>%
ungroup() %>%
mutate(n = ifelse(n == TRUE, "Within", "Without")) %>%
spread(n, nn) %>%
mutate(Percent = round(Within / (Within + Without) * 100, 2),
iter = run)
return(df)
}
## Calculates the sum of observations below dist_value1
## for each group
below_calc <- function(df, run, dist_value1 = 0.1) {
df <- df %>%
group_by(group1, group2) %>%
mutate(n = value < dist_value1) %>%
group_by(group1, group2, n) %>%
count() %>%
ungroup() %>%
mutate(n = ifelse(n == TRUE, "Below", "Above")) %>%
spread(n, nn) %>%
mutate(Below = ifelse(is.na(Below) == TRUE, 0, Below)) %>%
mutate(Percent_below = round(Below / (Above + Below) * 100, 2),
iter = run)
return(df)
}
## Calculates the sum of observations above dist_value1
## for each group
above_calc <- function(df, run, dist_value1 = 0.9) {
df <- df %>%
group_by(group1, group2) %>%
mutate(n = value > dist_value1) %>%
group_by(group1, group2, n) %>%
count() %>%
ungroup() %>%
mutate(n = ifelse(n == TRUE, "Above", "Below")) %>%
spread(n, nn) %>%
mutate(Above = ifelse(is.na(Above) == TRUE, 0, Above)) %>%
mutate(Percent_above = round(Above / (Above + Below) * 100, 2),
iter = run)
return(df)
}
## Calculates the mean of observations for each group
mean_calc <- function(df, run) {
df <- df %>%
group_by(group1, group2) %>%
mutate(metric = round(mean(value), 2)) %>%
select(group1, group2, metric) %>%
summarise_all(funs(func_paste)) %>%
mutate(iter = run,
metric = as.numeric(metric))
}
## Calculates the median of observations for each group
median_calc <- function(df, run) {
df <- df %>%
group_by(group1, group2) %>%
mutate(metric = round(median(value), 2)) %>%
select(group1, group2, metric) %>%
summarise_all(funs(func_paste)) %>%
mutate(iter = run,
metric = as.numeric(metric))
}
## Calculates the quantiles of the observations for each group
quantile_calc <- function(df, run) {
df <- df %>%
group_by(group1, group2) %>%
mutate(quant_0 = quantile(value)[1],
quant_25 = quantile(value)[2],
quant_50 = quantile(value)[3],
quant_75 = quantile(value)[4],
quant_100 = quantile(value)[5]) %>%
select(group1, group2, contains("quant")) %>%
summarise_all(funs(func_paste)) %>%
mutate_at(vars(contains("quant")),
funs(as.numeric)) %>%
mutate(iter = run)
}
# Iteration function
iterate_samples <- function(df, run, method = "mean", dist_value1, dist_value2) {
# randomize the value column, regardless of groups
df <- df %>%
mutate(value = sample(value))
# run specified function
if (method == "mean") {
df <- mean_calc(df, run)
}
if (method == "below") {
df <- below_calc(df, run, dist_value1)
}
if (method == "above") {
df <- above_calc(df, run, dist_value1)
}
if (method == "within") {
df <- within(df, run, dist_value1, dist_value2)
}
if (method == "without") {
df <- without(df, run, dist_value1, dist_value2)
}
if (method == "median") {
df <- median_calc(df, run)
}
if (method == "quantile") {
df <- quantile_calc(df, run)
}
return(df)
}
# run iterations on selected data frame
run_iterations <- function(df, runs = 1000, seed = 10, method = "mean", dist_value1, dist_value2) {
# set seed
set.seed(seed)
# create output list
output <- list()
# create expected values
if (method == "mean") {
orig <- mean_calc(df, 0)
}
if (method == "below") {
orig <- below_calc(df, 0, dist_value1)
}
if (method == "above") {
orig <- above_calc(df, 0, dist_value1)
}
if (method == "within") {
orig <- within(df, 0, dist_value1, dist_value2)
}
if (method == "without") {
orig <- without(df, 0, dist_value1, dist_value2)
}
if (method == "median") {
orig <- median_calc(df, 0)
}
if (method == "quantile") {
orig <- quantile_calc(df, 0)
}
output <- c(output, list(orig))
# run iterations
for (i in 1:runs) {
result <- iterate_samples(df,
i,
method = method,
dist_value1 = dist_value1,
dist_value2 = dist_value2)
output <- c(output, list(result))
}
return(output)
}
# Run functions
summary_stats <- run_df %>%
mutate(total_mean = round(mean(value)*100, 3)) %>%
group_by(group1) %>%
mutate(mean = round(mean(value)*100, 3),
var = round(var(value)*100, 3)) %>%
select(group1, total_mean, mean, var) %>%
summarise_all(funs(func_paste))
summary_stats2 <- run_df %>%
mutate(total_mean = round(mean(value), 3)) %>%
group_by(group1) %>%
mutate(mean = round(mean(value), 3),
var = round(var(value), 3)) %>%
select(group1, total_mean, mean, var) %>%
summarise_all(funs(func_paste))
complete_data <- run_iterations(run_df,
runs = 1000,
method = "without",
dist_value1 = 0.1,
dist_value2 = 0.8) %>%
bind_rows() %>%
mutate(Percent = as.numeric(Percent))
lower_data <- run_iterations(run_df,
runs = 1000,
method = "below",
dist_value1 = 0.1) %>%
bind_rows() %>%
mutate(Percent_below = as.numeric(Percent_below))
# Plots
ggplot(complete_data, aes(Percent)) +
geom_histogram(aes(y = ..density..), binwidth = 0.2, color = "black",
fill = "grey80") +
stat_function(fun = dnorm,
args = with(complete_data, c(mean = mean(Percent), sd = sd(Percent))),
color = "red") +
geom_segment(aes(x = complete_data$Percent[1],
xend = complete_data$Percent[1], y = 0.08, yend = 0),
arrow = arrow(length = unit(0.3, "cm"),
type = "closed"),
color = "#4575b4") +
geom_segment(aes(x = complete_data$Percent[2],
xend = complete_data$Percent[2], y = 0.08, yend = 0),
arrow = arrow(length = unit(0.3, "cm"),
type = "closed"),
color = "#74add1") +
geom_segment(aes(x = complete_data$Percent[3],
xend = complete_data$Percent[3], y = 0.08, yend = 0),
arrow = arrow(length = unit(0.3, "cm"),
type = "closed"),
color = "#f46d43") +
geom_segment(aes(x = complete_data$Percent[4],
xend = complete_data$Percent[4], y = 0.08, yend = 0),
arrow = arrow(length = unit(0.3, "cm"),
type = "closed"),
color = "#fdae61") +
geom_text(label = "p < 0.001",
x = 83.30,
y = 0.16,
angle = 90) +
geom_vline(xintercept = as.numeric(summary_stats$mean[1]),
size = 1,
color = "#4575b4") +
geom_vline(xintercept = as.numeric(summary_stats$mean[2]),
size = 1,
color = "#74add1") +
geom_vline(xintercept = as.numeric(summary_stats$mean[3]),
size = 1,
color = "#f46d43") +
geom_vline(xintercept = as.numeric(summary_stats$mean[4]),
size = 1,
color = "#fdae61") +
labs(x = "Percent (%) distances < 0.1 or > 0.8",
y = "Density") +
ggtitle(label = "Distribution of percentages",
subtitle = "Number of iterations: 1000") +
theme(plot.title = element_text(hjust = 0))ggplot(lower_data, aes(Percent_below)) +
geom_histogram(aes(y = ..density..), binwidth = 0.2, color = "black",
fill = "grey80") +
stat_function(fun = dnorm,
args = with(lower_data, c(mean = mean(Percent_below), sd = sd(Percent_below))),
color = "red") +
geom_segment(aes(x = lower_data$Percent_below[1],
xend = lower_data$Percent_below[1], y = 0.15, yend = 0),
arrow = arrow(length = unit(0.3, "cm"),
type = "closed"),
color = "#4575b4") +
geom_segment(aes(x = lower_data$Percent_below[2],
xend = lower_data$Percent_below[2], y = 0.15, yend = 0),
arrow = arrow(length = unit(0.3, "cm"),
type = "closed"),
color = "#74add1") +
geom_segment(aes(x = lower_data$Percent_below[3],
xend = lower_data$Percent_below[3], y = 0.15, yend = 0),
arrow = arrow(length = unit(0.3, "cm"),
type = "closed"),
color = "#f46d43") +
geom_segment(aes(x = lower_data$Percent_below[4],
xend = lower_data$Percent_below[4], y = 0.15, yend = 0),
arrow = arrow(length = unit(0.3, "cm"),
type = "closed"),
color = "#fdae61") +
geom_text(label = "p < 0.001",
x = 5.87,
y = 0.32,
angle = 90) +
geom_vline(xintercept = mean(lower_data$Percent_below),
size = 1) +
labs(x = "Percent (%) distances < 0.1",
y = "Density") +
ggtitle(label = "Distribution of Percentages",
subtitle = "Number of iterations: 1000") +
theme(plot.title = element_text(hjust = 0))summary_stats2[1] Alikhan N-F, Zhou Z, Sergeant MJ, Achtman M. A genomic overview of the population structure of Salmonella. Casadesús J, editor. PLOS Genet [Internet]. 2018 Apr 5;14(4):e1007261. Available from: https://dx.plos.org/10.1371/journal.pgen.1007261
[2] Machado MP, Silva M, Carriço JA, Silva DN, Santos S, Ramirez M, et al. chewBBACA: A complete suite for gene-by-gene schema creation and strain identification. Microb Genomics. 2018;1–7.